import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# html export
# Make plotly figures render inside the exported notebook HTML.
import plotly.io as pio
pio.renderers.default = 'notebook'
# Load the raw traffic-accidents dataset from the Excel workbook.
df = pd.read_excel('TRAFFIC ACCIDENTS DATA.xlsx')
# First look at the raw frame (notebook cell output).
df
| Date | Accident Spot | Area | County | Road/ Highway | Brief Accident Details/Cause | Victims | Total people confirmed dead | Time of the Accidents | Weather conditions | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-08-08 | Sobea | Sobea | Nakuru | Nakuru-Eldoret Highway | Head on Collision | Passengers | 4 | 4.30 pm | NaN |
| 1 | 2023-08-07 | Maai-Mahiu | Naivasha | Nakuru | Maai-Mahiu Naivasha Highway | vehicle and motorcycle collision | Passengers | 1 | 5.50 pm | NaN |
| 2 | 2023-07-25 | Ntulele | Ntulele | Narok | Narok Mai Mahiu road | Head on Collision | Drivers/Occupants | 4 | NaN | NaN |
| 3 | 2022-12-02 | Suswa | Suswa | Narok | Narok Mai Mahiu road | Head on Collision | Driver and passengers | 3 | 6.00 pm | NaN |
| 4 | 2022-12-01 | Mutira | Mutira | Kirinyaga | Kerugoya-Karatina Road | Run over | Pedestrian | 1 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 76 | 2022-06-03 | Losengeli | Sabatia | Vihiga | Nairobi Kakamega road | Bus lost control and rolled over | Passengers | 1 | NaN | NaN |
| 77 | 2022-05-31 | Isinya | Isinya | Kajiado | Namanga Road | Car rammed into a trailer | Passengers | 1 | 11.00 pm | NaN |
| 78 | 2022-05-30 | Kinugi | Kinugi | Nakuru | Naivasha Nairobi Highway | Truck driver collided with an oncoming matatu | Passengers | 1 | NaN | NaN |
| 79 | 2022-02-28 | Kinugi | Kinugi | Nakuru | Nakuru - Nairobi highway | Car rammed into a lorry | Passengers | 4 | NaN | NaN |
| 80 | 2023-08-08 | Sobea | Sobea | Nakuru | Nakuru Eldoret Highway | matatu collided with a truck | Passengers | 2 | 4.30 am | NaN |
81 rows × 10 columns
Columns and Description¶
- Date - Contains dates of the accidents
- Accident Spot - Describes the specific location of the accident
- Area - General area where the accident occurred.
- County - County in which the accident happened.
- Road/ Highway - Road or highway where the accident occurred.
- Brief Accident Details/Cause - Short description of the accident cause or details.
- Victims - Specifies the type of victims involved (e.g., passengers, drivers).
- Total people confirmed dead - Indicates the number of fatalities.
- Time of the Accidents - The time when the accidents occurred.
- Weather conditions - Details about weather conditions during the accidents.
Preparing the dataset¶
# Inspect the raw column names before normalising them.
df.columns
Index(['Date', 'Accident Spot', 'Area', 'County', 'Road/ Highway',
'Brief Accident Details/Cause', 'Victims',
'Total people confirmed dead', 'Time of the Accidents',
'Weather conditions'],
dtype='object')
# Make column names and values uniform.
df.rename(columns={'Road/ Highway': 'Road/Highway'}, inplace=True)  # drop stray space in the name
# pd.to_numeric on the whole Series is the vectorised, idiomatic form of
# .apply(pd.to_numeric, ...); non-numeric entries become NaN.
df['Total people confirmed dead'] = pd.to_numeric(df['Total people confirmed dead'], errors='coerce')
# snake_case every column name.
df.columns = df.columns.str.lower().str.replace(' ', '_')
# Lower-case all text columns and replace internal spaces so values are uniform too.
categorical_columns = df.dtypes[df.dtypes == 'object'].index
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
df
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | weather_conditions | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-08-08 | sobea | sobea | nakuru | nakuru-eldoret_highway | head_on_collision | passengers | 4.0 | 4.30_pm | NaN |
| 1 | 2023-08-07 | maai-mahiu | naivasha | nakuru | maai-mahiu_naivasha_highway | vehicle_and_motorcycle_collision | passengers | 1.0 | 5.50_pm | NaN |
| 2 | 2023-07-25 | ntulele | ntulele | narok | narok_mai_mahiu_road | head_on_collision | drivers/occupants | 4.0 | NaN | NaN |
| 3 | 2022-12-02 | suswa | suswa | narok | narok_mai_mahiu_road | head_on_collision | driver_and_passengers | 3.0 | 6.00_pm | NaN |
| 4 | 2022-12-01 | mutira | mutira | kirinyaga | kerugoya-karatina_road | run_over | pedestrian | 1.0 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 76 | 2022-06-03 | losengeli | sabatia | vihiga | nairobi_kakamega_road | bus_lost_control_and_rolled_over | passengers | 1.0 | NaN | NaN |
| 77 | 2022-05-31 | isinya | isinya | kajiado | namanga_road | car_rammed_into_a_trailer | passengers | 1.0 | 11.00_pm | NaN |
| 78 | 2022-05-30 | kinugi | kinugi | nakuru | naivasha_nairobi_highway | truck_driver_collided_with_an_oncoming_matatu | passengers | 1.0 | NaN | NaN |
| 79 | 2022-02-28 | kinugi | kinugi | nakuru | nakuru_-_nairobi_highway_ | car_rammed_into_a_lorry | passengers | 4.0 | NaN | NaN |
| 80 | 2023-08-08 | sobea | sobea | nakuru | nakuru_eldoret_highway | matatu_collided_with_a_truck | passengers | 2.0 | 4.30_am | NaN |
81 rows × 10 columns
# Check the dtype of every column after normalisation.
df.dtypes
date datetime64[ns] accident_spot object area object county object road/highway object brief_accident_details/cause object victims object total_people_confirmed_dead float64 time_of_the_accidents object weather_conditions object dtype: object
# --- dtype conversions ---
# Fatalities: nullable integer so missing values survive the cast.
df['total_people_confirmed_dead'] = df['total_people_confirmed_dead'].astype('Int64')

# Times arrive as strings like '4.30_pm'; normalise them to 24-hour 'HH:MM'.
raw_times = df['time_of_the_accidents']
# 1) '4.30_pm' / '4.30pm' -> '4:30 pm' (underscore optional)
normalised = raw_times.replace(
    r'(\d{1,2})\.(\d{1,2})_?([ap]m)',
    r'\1:\2 \3',
    regex=True
)
# 2) parse the 12-hour clock; anything unparseable becomes NaT
parsed = pd.to_datetime(normalised, format='%I:%M %p', errors='coerce')
# 3) keep only the 'HH:MM' text
df['time_of_the_accidents'] = parsed.dt.strftime('%H:%M')
df
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | weather_conditions | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-08-08 | sobea | sobea | nakuru | nakuru-eldoret_highway | head_on_collision | passengers | 4 | 16:30 | NaN |
| 1 | 2023-08-07 | maai-mahiu | naivasha | nakuru | maai-mahiu_naivasha_highway | vehicle_and_motorcycle_collision | passengers | 1 | 17:50 | NaN |
| 2 | 2023-07-25 | ntulele | ntulele | narok | narok_mai_mahiu_road | head_on_collision | drivers/occupants | 4 | NaN | NaN |
| 3 | 2022-12-02 | suswa | suswa | narok | narok_mai_mahiu_road | head_on_collision | driver_and_passengers | 3 | 18:00 | NaN |
| 4 | 2022-12-01 | mutira | mutira | kirinyaga | kerugoya-karatina_road | run_over | pedestrian | 1 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 76 | 2022-06-03 | losengeli | sabatia | vihiga | nairobi_kakamega_road | bus_lost_control_and_rolled_over | passengers | 1 | NaN | NaN |
| 77 | 2022-05-31 | isinya | isinya | kajiado | namanga_road | car_rammed_into_a_trailer | passengers | 1 | 23:00 | NaN |
| 78 | 2022-05-30 | kinugi | kinugi | nakuru | naivasha_nairobi_highway | truck_driver_collided_with_an_oncoming_matatu | passengers | 1 | NaN | NaN |
| 79 | 2022-02-28 | kinugi | kinugi | nakuru | nakuru_-_nairobi_highway_ | car_rammed_into_a_lorry | passengers | 4 | NaN | NaN |
| 80 | 2023-08-08 | sobea | sobea | nakuru | nakuru_eldoret_highway | matatu_collided_with_a_truck | passengers | 2 | 04:30 | NaN |
81 rows × 10 columns
# Summary: row count, per-column non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 81 entries, 0 to 80 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 80 non-null datetime64[ns] 1 accident_spot 81 non-null object 2 area 81 non-null object 3 county 81 non-null object 4 road/highway 80 non-null object 5 brief_accident_details/cause 79 non-null object 6 victims 63 non-null object 7 total_people_confirmed_dead 71 non-null Int64 8 time_of_the_accidents 25 non-null object 9 weather_conditions 1 non-null object dtypes: Int64(1), datetime64[ns](1), object(8) memory usage: 6.5+ KB
Data Cleaning¶
# Count fully-duplicated rows.
df.duplicated().sum()
1
# Drop duplicated rows and confirm none remain.
df = df.drop_duplicates()
df.duplicated().sum()
0
# Per-column null counts.
df.isna().sum()
date 1 accident_spot 0 area 0 county 0 road/highway 1 brief_accident_details/cause 2 victims 17 total_people_confirmed_dead 9 time_of_the_accidents 55 weather_conditions 79 dtype: int64
# Share (%) of rows missing a weather condition.
(df['weather_conditions'].isna().sum()/len(df)) * 100
98.75
Approximately 99% of the 'weather_conditions' column is null, hence it is safe to drop this column.
# Remove the near-empty 'weather_conditions' column (~99% missing).
df = df.drop(columns=['weather_conditions'])
# date
# Inspect the row(s) with a missing date.
df[df['date'].isna()]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 66 | NaT | ndarugo | ndarugo | kiambu | thika_road_ | truck_lost_control_and_overturned | NaN | 0 | 10:00 |
# Look at the neighbourhood of the missing-date row (index 66) for context.
df.iloc[60:70]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 61 | 2023-04-01 | amabuko | kroka_town | kisii | kisii_keroka_road | matatu_collided_with_a_trailer | passengers | 5 | NaN |
| 62 | 2023-03-30 | kayole_bridge | nakuru | nakuru | nakuru_-_nairobi_highway_ | bus_collided_with_a_matatu | passengers | 18 | NaN |
| 63 | 2023-03-31 | burguret_shopping_centre | burguret | muranga | naromoru_nanyuki_road | bus_overturned | passengers | 0 | NaN |
| 64 | 2023-04-18 | maragua | maragua | muranga | kenol_muranga_road | matatu_collided_with_a_lorry | passengers | 2 | NaN |
| 65 | 2023-08-08 | migori_town | migori_town | migori | migori_isibania_road | truck_failed_brakes_and_ran_into_ither_vehicles | pedestrians | 8 | 07:30 |
| 66 | NaT | ndarugo | ndarugo | kiambu | thika_road_ | truck_lost_control_and_overturned | NaN | 0 | 10:00 |
| 67 | 2023-02-04 | kakwamunyen | kakuma | turkana | lodwar_kakuma_road | driver_swerved_to_avoid_on_hitting_a_camel_on_... | passengers | 14 | 22:00 |
| 68 | 2023-01-29 | mamboleo | mamboleo | kisumu | kakamega_kisumu_road | bus_veered_off_the_road_and_landed_in_a_ditch | passengers | 0 | NaN |
| 69 | 2023-01-26 | kikopey | kikopey | nakuru | nakuru_-_nairobi_highway_ | a_matatu_rammed_into_a_lorry | passengers | 7 | NaN |
| 70 | 2023-09-08 | iibisil_towm | iibisil | kajiado | namanga_road | sallon_car_rammed_iinto_a_lorry | passengers | 3 | NaN |
# The surrounding data for the null date is from around the same period,
# so forward-filling from the previous row is a reasonable imputation.
# Use .loc to fill the 'date' column forward
df.loc[:, 'date'] = df['date'].ffill()
df.isna().sum()
date 0 accident_spot 0 area 0 county 0 road/highway 1 brief_accident_details/cause 2 victims 17 total_people_confirmed_dead 9 time_of_the_accidents 55 dtype: int64
# road/highway
# Inspect the row with a missing road/highway.
df[df['road/highway'].isna()]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 72 | 2022-07-16 | koru | koru | kisumu | NaN | bus_rolled_as_it_tried_avoiding_collision_with... | passengers | 0 | 14:00 |
# Other Kisumu records, to see whether the missing road can be inferred.
df[df['county'] == 'kisumu']
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 22 | 2022-06-15 | koru | muhoroni | kisumu | kisumu_highway_ | bus_lost_control_and_rolled | passengers_and_driver | <NA> | NaN |
| 24 | 2023-08-29 | ojolla | ojolla | kisumu | kisumu_-_busia_road | head_on_collision_ | passengers_and_drivers | <NA> | NaN |
| 39 | 2023-08-28 | ojola | ojola | kisumu | kisumu_-_busia_road | driver_tried_to_overtake,_rammed_into_a_traile... | passenger | 1 | NaN |
| 68 | 2023-01-29 | mamboleo | mamboleo | kisumu | kakamega_kisumu_road | bus_veered_off_the_road_and_landed_in_a_ditch | passengers | 0 | NaN |
| 71 | 2022-08-30 | coptic_round | coptic | kisumu | kisumu_kakamega_road | truck_lost_control_and_overturned | pedestrians | 3 | NaN |
| 72 | 2022-07-16 | koru | koru | kisumu | NaN | bus_rolled_as_it_tried_avoiding_collision_with... | passengers | 0 | 14:00 |
There is not much supporting data from which to infer the missing 'road/highway' value, hence it is safe to drop that row.
# No reliable way to infer the missing road, so drop that row and renumber.
df = df.dropna(subset=['road/highway']).reset_index(drop=True)
df.isna().sum()
date 0 accident_spot 0 area 0 county 0 road/highway 0 brief_accident_details/cause 2 victims 17 total_people_confirmed_dead 9 time_of_the_accidents 55 dtype: int64
# brief_accident_details/cause
# Rows missing the accident cause/details.
df[df['brief_accident_details/cause'].isna()]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 43 | 2023-04-02 | masaba | masaba | kisii | kisii_keroka_road | NaN | NaN | <NA> | NaN |
| 58 | 2023-04-21 | mau_summit | kuresoi_north | nakuru | nakuru_eldoret_highway | NaN | pedestrian | 1 | 18:30 |
# Other records on the same road, as candidate fill values.
df[df['road/highway'] == 'kisii_keroka_road']
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 43 | 2023-04-02 | masaba | masaba | kisii | kisii_keroka_road | NaN | NaN | <NA> | NaN |
| 60 | 2023-04-01 | amabuko | kroka_town | kisii | kisii_keroka_road | matatu_collided_with_a_trailer | passengers | 5 | NaN |
# Filter rows where road/highway is "kisii_keroka_road"
kisii_keroka_road_rows = df[df['road/highway'] == 'kisii_keroka_road']
# Use loc to apply backward fill on the slice
# (the later record on the same road supplies the missing cause).
df.loc[kisii_keroka_road_rows.index, 'brief_accident_details/cause'] = kisii_keroka_road_rows['brief_accident_details/cause'].bfill()
# Records on the nakuru-eldoret highway, to fill its missing cause next.
df[df['road/highway'] == 'nakuru_eldoret_highway']
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 26 | 2023-07-09 | ngata_bridge | ngata | nakuru | nakuru_eldoret_highway | matatu_and_truck_head_on_collision | passenger | 1 | 22:00 |
| 30 | 2023-05-15 | sachangwan | sachangwan | bomet | nakuru_eldoret_highway | matatu_hit_a_bump,_veered_off_the_road_and_lan... | NaN | 5 | 02:00 |
| 35 | 2023-06-24 | migaa | mau_hills | nakuru | nakuru_eldoret_highway | matatu_driver_lot_ontrol_and_hit_a_lorry | driver_and_passengers | 3 | NaN |
| 37 | 2023-09-26 | ainabkoi_intersection | ainabkoi | uansin_gishi | nakuru_eldoret_highway | bus_collided_with_a_saloon_car | NaN | 0 | NaN |
| 48 | 2023-08-08 | sobea | sobea | nakuru | nakuru_eldoret_highway | matatu_collided_with_a_truck | passengers | 2 | NaN |
| 56 | 2023-06-24 | migaa | mau_hills | nakuru | nakuru_eldoret_highway | matatu_driver_lost_control_and_hit_a_lorry | passenger | 3 | NaN |
| 57 | 2023-05-15 | sachangwan | sachangwan | nakuru | nakuru_eldoret_highway | matatu_hit_a_bump_and_veered_off_the_road | passengers | 5 | 02:00 |
| 58 | 2023-04-21 | mau_summit | kuresoi_north | nakuru | nakuru_eldoret_highway | NaN | pedestrian | 1 | 18:30 |
| 78 | 2023-08-08 | sobea | sobea | nakuru | nakuru_eldoret_highway | matatu_collided_with_a_truck | passengers | 2 | 04:30 |
# Filter rows where road/highway is "nakuru_eldoret_highway"
nakuru_eldoret_highway_rows = df[df['road/highway'] == 'nakuru_eldoret_highway']
# Use loc to apply forward fill on the slice, as the cause above would fit the victim
df.loc[nakuru_eldoret_highway_rows.index, 'brief_accident_details/cause'] = nakuru_eldoret_highway_rows['brief_accident_details/cause'].ffill()
df.isna().sum()
date 0 accident_spot 0 area 0 county 0 road/highway 0 brief_accident_details/cause 0 victims 17 total_people_confirmed_dead 9 time_of_the_accidents 55 dtype: int64
# total_people_confirmed_dead
# Rows missing the fatality count.
df[df['total_people_confirmed_dead'].isna()]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 12 | 2021-02-23 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | head_on_collision | NaN | <NA> | 02:00 |
| 14 | 2020-08-17 | river_maara | river_maara | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_river | NaN | <NA> | NaN |
| 21 | 2023-09-27 | sotik | sotik | bomet | bomet_-_sotik_highway | bodaboda_rider_hit_by_matatu | rider | <NA> | NaN |
| 22 | 2022-06-15 | koru | muhoroni | kisumu | kisumu_highway_ | bus_lost_control_and_rolled | passengers_and_driver | <NA> | NaN |
| 23 | 2021-08-29 | londiani | londiani | nakuru | nakuru_kericho_highway | vehicle_lost_control_nd_rolled | NaN | <NA> | NaN |
| 24 | 2023-08-29 | ojolla | ojolla | kisumu | kisumu_-_busia_road | head_on_collision_ | passengers_and_drivers | <NA> | NaN |
| 25 | 2021-08-25 | greensteads | greensteads | nakuru | nakuru_-_nairobi_highway_ | head_on_collision | passengers | <NA> | NaN |
| 27 | 2023-01-13 | ratili | ratili_narok_south | narok | narok_road | saloom_car_rolled | passangers_and_driver | <NA> | NaN |
| 43 | 2023-04-02 | masaba | masaba | kisii | kisii_keroka_road | matatu_collided_with_a_trailer | NaN | <NA> | NaN |
If we have no information on the victims, then we can't assign a number to the total people confirmed dead; hence we drop the rows where both 'victims' and 'total people confirmed dead' are null.
# Rows where both the victim type and the fatality count are unknown.
df[df['victims'].isna() & df['total_people_confirmed_dead'].isna()]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 12 | 2021-02-23 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | head_on_collision | NaN | <NA> | 02:00 |
| 14 | 2020-08-17 | river_maara | river_maara | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_river | NaN | <NA> | NaN |
| 23 | 2021-08-29 | londiani | londiani | nakuru | nakuru_kericho_highway | vehicle_lost_control_nd_rolled | NaN | <NA> | NaN |
| 43 | 2023-04-02 | masaba | masaba | kisii | kisii_keroka_road | matatu_collided_with_a_trailer | NaN | <NA> | NaN |
# Drop rows where both 'victims' and 'total_people_confirmed_dead' are null
df = df.dropna(subset=['victims', 'total_people_confirmed_dead'], how='all')
# Remaining rows that still lack a fatality count.
df[df['total_people_confirmed_dead'].isna()]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 21 | 2023-09-27 | sotik | sotik | bomet | bomet_-_sotik_highway | bodaboda_rider_hit_by_matatu | rider | <NA> | NaN |
| 22 | 2022-06-15 | koru | muhoroni | kisumu | kisumu_highway_ | bus_lost_control_and_rolled | passengers_and_driver | <NA> | NaN |
| 24 | 2023-08-29 | ojolla | ojolla | kisumu | kisumu_-_busia_road | head_on_collision_ | passengers_and_drivers | <NA> | NaN |
| 25 | 2021-08-25 | greensteads | greensteads | nakuru | nakuru_-_nairobi_highway_ | head_on_collision | passengers | <NA> | NaN |
| 27 | 2023-01-13 | ratili | ratili_narok_south | narok | narok_road | saloom_car_rolled | passangers_and_driver | <NA> | NaN |
# A 'rider' victim record involves a single person.
df[df['victims'] == 'rider']
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 21 | 2023-09-27 | sotik | sotik | bomet | bomet_-_sotik_highway | bodaboda_rider_hit_by_matatu | rider | <NA> | NaN |
# Fill 'total_people_confirmed_dead' with 1 where 'victims' is 'rider'
# NOTE(review): assumes the lone rider died — confirm against the source report.
df.loc[df['victims'] == 'rider', 'total_people_confirmed_dead'] = 1
# Central tendencies of total people confirmed dead.
# Use non-shadowing names: the original bound over the builtins max/min,
# and np.mean(series) is replaced by the direct Series methods.
deaths = df['total_people_confirmed_dead']
max_dead = deaths.max()
min_dead = deaths.min()
mean_dead = deaths.mean()
mode_dead = deaths.mode()[0]
median_dead = deaths.median()
print(f"Max: {max_dead}")
print(f"Min: {min_dead}")
print(f"Mean: {mean_dead}")
print(f"Mode: {mode_dead}")
print(f"Median: {median_dead}")
Max: 52 Min: 0 Mean: 4.957746478873239 Mode: 1 Median: 3.0
# Visualise the distribution of fatality counts to check for outliers.
px.box(df['total_people_confirmed_dead'])
The distribution has outliers, so replacing the nulls with the mean would not be appropriate; we use the median instead. Moreover, the null rows list both passengers and driver as victims, meaning more than one person was involved.
# Replace nulls in 'total_people_confirmed_dead' with the column's median
# (the median is robust to the outliers seen in the box plot).
df['total_people_confirmed_dead'] = df['total_people_confirmed_dead'].fillna(df['total_people_confirmed_dead'].median())
# Confirm the remaining null counts.
df.isna().sum()
date 0 accident_spot 0 area 0 county 0 road/highway 0 brief_accident_details/cause 0 victims 13 total_people_confirmed_dead 0 time_of_the_accidents 52 dtype: int64
# victims
# Inspect the raw victim categories (many near-duplicate spellings).
df['victims'].value_counts()
victims passengers 36 driver_and_passengers 4 passengers_and_driver 3 passenger 3 pedestrians 3 pedestrian 2 occupants_of_the_small_vehicle_ 2 passengers_and_drivers 2 drivers/occupants 1 passengers_and_pedestrians_ 1 drivers,passengers_and_pedestrians 1 rider 1 passangers_and_driver 1 driver_and_other_motorists_ 1 motorist 1 Name: count, dtype: int64
# Dictionary to map similar names to a unified value
# (collapses singular/plural forms, misspellings like 'passangers',
# and trailing-underscore variants into canonical categories).
mapping_dict = {
    'passengers': 'passengers',
    'driver_and_passengers': 'driver_and_passengers',
    'passengers_and_driver': 'driver_and_passengers',
    'passenger': 'passengers',
    'pedestrians': 'pedestrians',
    'pedestrian': 'pedestrians',
    'occupants_of_the_small_vehicle_': 'occupants_of_the_small_vehicle_',
    'passengers_and_drivers': 'driver_and_passengers',
    'drivers/occupants': 'drivers/occupants',
    'passengers_and_pedestrians_': 'passengers_and_pedestrians',
    'drivers,passengers_and_pedestrians': 'driver_passenger_pedestrian',
    'rider': 'motorist',
    'passangers_and_driver': 'driver_and_passengers',
    'driver_and_other_motorists_': 'driver_and_motorists',
    'motorist': 'motorist'
}
# Replace values in the 'victims' column based on the mapping
df['victims'] = df['victims'].replace(mapping_dict)
# Check updated value counts
df['victims'].value_counts()
victims passengers 39 driver_and_passengers 10 pedestrians 5 occupants_of_the_small_vehicle_ 2 motorist 2 drivers/occupants 1 passengers_and_pedestrians 1 driver_passenger_pedestrian 1 driver_and_motorists 1 Name: count, dtype: int64
# Rows still missing a victim category after the cleanup.
df[df['victims'].isna()]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 28 | 2023-05-22 | isinya | isinya | kajiado | isinya_kiserian_road | bus_lost_control_and_rolled_into_a_ditch | NaN | 7 | NaN |
| 30 | 2023-05-15 | sachangwan | sachangwan | bomet | nakuru_eldoret_highway | matatu_hit_a_bump,_veered_off_the_road_and_lan... | NaN | 5 | 02:00 |
| 36 | 2023-09-27 | laisamis | laisamis | isiolo | marsabit_isiolo_highway | vehicle_veered_off_rhe_road_after_tyre_burst | NaN | 2 | NaN |
| 37 | 2023-09-26 | ainabkoi_intersection | ainabkoi | uansin_gishi | nakuru_eldoret_highway | bus_collided_with_a_saloon_car | NaN | 0 | NaN |
| 42 | 2023-04-16 | josa | mwatate | taita_taveta | mombasa_-_nairobi_highway_ | head_on_collision | NaN | 10 | NaN |
| 44 | 2023-09-18 | gitaru | gitaru | kiambu | nakuru_-_nairobi_highway_ | driver_lost_control_and_hit_barrier | NaN | 0 | NaN |
| 45 | 2023-09-08 | malili | malili | makueni | mombasa_-_nairobi_highway_ | involved_a_truck,_lorry_and_saloon_car | NaN | 0 | 16:20 |
| 47 | 2023-08-30 | nairagie-enkare | narok | narok | narok_mai_mahiu_road | bus_burst_into_flames | NaN | 0 | NaN |
| 51 | 2023-07-04 | mau_summit | kuresoi_north | nakuru | nakuru_kericho_highway | bus_lost_control_and_landed_in_a_ditch | NaN | 0 | NaN |
| 52 | 2022-04-26 | kirinyaga | kirinyaga | kirinyaga | rukenya_kimunye_road | bus_overturned | NaN | 0 | NaN |
| 55 | 2021-12-03 | tuthamba | kirinyaga | kirinyaga | sagana_kagio_road | matatu_collided_head_on_with_a_saloon_car | NaN | 0 | 21:00 |
| 59 | 2023-04-18 | naivasha | naivasha | nakuru | nakuru_-_nairobi_highway_ | matatu_collided_with_a_lorry | NaN | 5 | NaN |
| 65 | 2023-08-08 | ndarugo | ndarugo | kiambu | thika_road_ | truck_lost_control_and_overturned | NaN | 0 | 10:00 |
# Impute the remaining missing victim categories by sampling from the
# two dominant categories.
top_2_values = df['victims'].value_counts().head(2).index
# Use a seeded generator so the imputation — and everything downstream —
# is reproducible (the original unseeded np.random.choice was not).
rng = np.random.default_rng(42)
n_missing = df['victims'].isna().sum()
df.loc[df['victims'].isna(), 'victims'] = rng.choice(top_2_values, size=n_missing)
df.isna().sum()
date 0 accident_spot 0 area 0 county 0 road/highway 0 brief_accident_details/cause 0 victims 0 total_people_confirmed_dead 0 time_of_the_accidents 52 dtype: int64
# time
# Distinct (already normalised 'HH:MM') accident times.
df['time_of_the_accidents'].unique()
array(['16:30', '17:50', nan, '18:00', '12:30', '02:00', '16:20', '18:30',
'21:00', '22:00', '23:30', '13:00', '19:30', '07:30', '10:00',
'23:00', '04:30'], dtype=object)
# Bucket the 'HH:MM' times into parts of the day.
# Derived programmatically from the hour instead of the brittle hand-written
# per-value dict; the cutoffs reproduce that mapping exactly:
#   05:00-11:59 morning, 12:00-16:59 afternoon, 17:00-18:59 evening, else night.
def _day_part(hhmm):
    """Map an 'HH:MM' string to morning/afternoon/evening/night; NaN passes through."""
    if pd.isna(hhmm):
        return hhmm  # keep missing values untouched, as Series.replace would
    hour = int(str(hhmm)[:2])
    if 5 <= hour < 12:
        return 'morning'
    if 12 <= hour < 17:
        return 'afternoon'
    if 17 <= hour < 19:
        return 'evening'
    return 'night'

# Replace values in the 'time_of_the_accidents' column with day-part buckets
df['time_of_the_accidents'] = df['time_of_the_accidents'].apply(_day_part)
# Check updated value counts
df['time_of_the_accidents'].value_counts()
time_of_the_accidents night 12 afternoon 5 evening 4 morning 2 Name: count, dtype: int64
# Sample of rows still missing a time of day.
df[df['time_of_the_accidents'].isna()].head(10)
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 2 | 2023-07-25 | ntulele | ntulele | narok | narok_mai_mahiu_road | head_on_collision | drivers/occupants | 4 | NaN |
| 4 | 2022-12-01 | mutira | mutira | kirinyaga | kerugoya-karatina_road | run_over | pedestrians | 1 | NaN |
| 5 | 2022-08-06 | mlima_swara | mlima_swara | murang'a | kenol-sagana_road | car_hit_a_stationary_lorry | passengers | 5 | NaN |
| 6 | 2022-07-27 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | driver_lost_control_and_swerved_off_the_bridge | passengers | 6 | NaN |
| 7 | 2022-07-28 | makuyu | makuyu | murang'a | kenol-sagana_road | vehicle_rammed_into_a_lorry | passengers | 7 | NaN |
| 8 | 2022-06-28 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_bridge | passengers | 4 | NaN |
| 9 | 2021-09-21 | river_tungu | river_tungu | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_river | passengers | 5 | NaN |
| 10 | 2021-09-27 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_bridge | passengers | 4 | NaN |
| 15 | 2023-09-08 | mlima_kiu | salama | makueni | mombasa_-_nairobi_highway_ | matatu_collided_head_on_with_an_oncoming_truck | passengers | 4 | NaN |
| 16 | 2023-04-18 | naivasha | delamere_farm | naivasha | nairobi_-_nakuru_highway | head_on_collision_matatu_and_lorry | passengers | 6 | NaN |
# Records sharing a road, to judge whether per-road imputation is sensible.
df[df['road/highway'] == 'narok_mai_mahiu_road']
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 2 | 2023-07-25 | ntulele | ntulele | narok | narok_mai_mahiu_road | head_on_collision | drivers/occupants | 4 | NaN |
| 3 | 2022-12-02 | suswa | suswa | narok | narok_mai_mahiu_road | head_on_collision | driver_and_passengers | 3 | evening |
| 47 | 2023-08-30 | nairagie-enkare | narok | narok | narok_mai_mahiu_road | bus_burst_into_flames | driver_and_passengers | 0 | NaN |
| 49 | 2023-07-25 | ntulele | narok | narok | narok_mai_mahiu_road | truck_collided_head_on_with_another_truck | passengers | 3 | NaN |
| 50 | 2022-12-02 | suswa | suswa | narok | narok_mai_mahiu_road | head_on_collision | driver_and_passengers | 1 | NaN |
# filling time based on the most frequent time for each 'road/highway'
# Group by 'road/highway' and get the most frequent time for each group
# NOTE(review): x.mode() is computed twice per group — harmless at this size.
most_frequent_time = df.groupby('road/highway')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
# Fill null values in 'time of accident' with the most frequent time per 'road/highway'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
30
# filling time based on the most frequent time for each 'accident_spot'
# Group by 'accident_spot' and get the most frequent time for each group
most_frequent_time = df.groupby('accident_spot')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
# Fill null values in 'time of accident' with the most frequent time per 'accident_spot'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
25
# filling time based on the most frequent time for each 'area'
# Group by 'area' and get the most frequent time for each group
most_frequent_time = df.groupby('area')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
# Fill null values in 'time of accident' with the most frequent time per 'area'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
24
# filling time based on the most frequent time for each 'county'
# Group by 'county' and get the most frequent time for each group
most_frequent_time = df.groupby('county')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
# Fill null values in 'time of accident' with the most frequent time per 'county'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
21
# Forward fill for the remaining null values
# (last resort: inherit the previous record's time of day).
df.loc[:, 'time_of_the_accidents'] = df['time_of_the_accidents'].ffill()
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
0
# Renumber rows after all the drops and confirm the cleaned schema.
df = df.reset_index(drop=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 75 entries, 0 to 74 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 75 non-null datetime64[ns] 1 accident_spot 75 non-null object 2 area 75 non-null object 3 county 75 non-null object 4 road/highway 75 non-null object 5 brief_accident_details/cause 75 non-null object 6 victims 75 non-null object 7 total_people_confirmed_dead 75 non-null Int64 8 time_of_the_accidents 75 non-null object dtypes: Int64(1), datetime64[ns](1), object(7) memory usage: 5.5+ KB
EDA (Exploratory Data Analysis)¶
# trying to see the trend of 'total_people_confirmed_dead' across the years
# Create the scatter plot
fig = px.scatter(
    df,
    x='date', # X-axis: time
    y='total_people_confirmed_dead', # Y-axis: total confirmed dead
    title='Total People Confirmed Dead Over Time',
    labels={'date': 'Date', 'total_people_confirmed_dead': 'Total Confirmed Dead'},
)
# Show the plot
fig.show()
# trend of victims
# Group by 'victims' category and sum the total deaths for each category
victims_death_sum = df.groupby('victims')['total_people_confirmed_dead'].sum().reset_index()
# Sort the DataFrame by total deaths in descending order
victims_death_sum = victims_death_sum.sort_values(by='total_people_confirmed_dead', ascending=False)
# Create the bar graph
fig = px.bar(
    victims_death_sum,
    x='victims', # X-axis: categories of victims
    y='total_people_confirmed_dead', # Y-axis: sum of total deaths
    title='Total Deaths by Victim Category',
    labels={'victims': 'Victim Category', 'total_people_confirmed_dead': 'Total Deaths'},
    text='total_people_confirmed_dead', # Display total deaths on top of bars
)
# Adjust the layout for better readability
fig.update_layout(
    xaxis_tickangle=-45, # Rotate x-axis labels for better readability
    yaxis_title='Total Deaths',
    xaxis_title='Victim Category',
)
# Show the plot
fig.show()
# trying to see the distribution of dead people values
# (box plot of the cleaned fatality column)
fig = px.box(df, y='total_people_confirmed_dead')
# Show the figure
fig.show()
# distribution of the accident spots
fig = px.histogram(
    df,
    x='accident_spot',  # X-axis: accident spot (categorical)
    title='Distribution of Accident Spots',
    # FIX: the labels dict must be keyed by the column name ('accident_spot');
    # the original key 'spot' did not match any column, so plotly ignored it.
    labels={'accident_spot': 'Spot'},
)
# Adjust the layout for better readability
fig.update_layout(
    xaxis_title='Spot',
    yaxis_title='Count',
    xaxis_tickangle=-45  # Rotate x-axis labels if necessary for better readability
)
# Show the plot
fig.show()
Model Training¶
Taking 'risk' as our target variable based on the number of deaths
For high risk (1), the number of deaths is large, above the 3rd quartile
For low risk (0), the number of deaths is small: at or below the 3rd quartile
# Create the 'risk' target variable from the fatality count.
# High = strictly above the 3rd quartile, per the rule stated above; the
# quartile is computed from the data instead of hard-coding 5.75, so the
# threshold stays correct if the cleaning steps change.
q3 = float(df['total_people_confirmed_dead'].quantile(0.75))
df['risk'] = df['total_people_confirmed_dead'].apply(lambda x: 'High' if x > q3 else 'Low')
# Every 15th row, to eyeball the new column.
df[::15]
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | risk | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-08-08 | sobea | sobea | nakuru | nakuru-eldoret_highway | head_on_collision | passengers | 4 | afternoon | Low |
| 15 | 2023-09-07 | voi | ndii | makueni | mombasa_-_nairobi_highway_ | matatu_and_lorry_head_on_collision | passengers | 12 | afternoon | High |
| 30 | 2023-06-24 | kaburengu_junction | chimoi | kakamega | eldoret_webuye_highway | bus_and_lorry_collision | passengers | 6 | night | High |
| 45 | 2023-07-25 | ntulele | narok | narok | narok_mai_mahiu_road | truck_collided_head_on_with_another_truck | passengers | 3 | evening | Low |
| 60 | 2023-08-08 | migori_town | migori_town | migori | migori_isibania_road | truck_failed_brakes_and_ran_into_ither_vehicles | pedestrians | 8 | morning | High |
# Binarise the target: High -> 1, Low -> 0.
df.risk = (df.risk == 'High').astype(int)
# Calculate risk counts
risk_counts = df['risk'].value_counts().reset_index()
risk_counts.columns = ['risk', 'count']
# Create a pie chart with hover data
fig = px.pie(
    risk_counts,
    names='risk', # Categories to group by
    values='count', # Values for the pie chart
    title='Distribution of Risk Levels',
    hover_data={'count': True} # Show the count on hover
)
# Update hover template for clarity
fig.update_traces(
    hovertemplate="<b>Risk Level:</b> %{label}<br><b>Count:</b> %{value}<extra></extra>"
)
# Show the plot
fig.show()
The distribution is highly imbalanced: the number of negative cases far outweighs the number of positive cases. This would lead to a class-imbalance problem when fitting our models.
Feature selection¶
# features and target variable
# X: all columns except 'date' (position 0) and the 'risk' target (position 9).
X = df.iloc[:,1:9]
y = df['risk']
# The triple-quoted string below is a disabled alternative (one-hot encoding
# via DictVectorizer); as a bare expression its repr is the cell output.
'''
# one-hot encoding
from sklearn.feature_extraction import DictVectorizer
# vectorizing our data
feat_dicts = X.to_dict(orient='records')
dv = DictVectorizer(sparse=False) # new instance
dv.fit(feat_dicts) # training it to learn our data
X_feat = dv.fit_transform(feat_dicts)'''
"\n# one-hot encoding\nfrom sklearn.feature_extraction import DictVectorizer\n\n# vectorizing our data\nfeat_dicts = X.to_dict(orient='records')\n\ndv = DictVectorizer(sparse=False) # new instance\n\ndv.fit(feat_dicts) # training it to learn our data\n\nX_feat = dv.fit_transform(feat_dicts)"
# label encoding
from sklearn.preprocessing import LabelEncoder

# Fit ONE encoder per categorical column and remember it, so the same
# category -> integer mapping can be reused later (e.g. when encoding a
# single row for prediction). Reusing one refitted encoder across columns
# leaves `le` holding only the LAST column's mapping.
le = LabelEncoder()  # kept for backward compatibility with later cells
encoders = {}        # column name -> fitted LabelEncoder


def _encode_column(col):
    """Label-encode object-dtype columns; pass numeric columns through unchanged."""
    if col.dtype == 'object':
        enc = LabelEncoder()
        encoders[col.name] = enc
        return pd.Series(enc.fit_transform(col), index=col.index, name=col.name)
    return col


X_encoded = X.apply(_encode_column)
# To identify which features contribute most to the outcome variable we run
# a chi-square test: a statistical test of the dependence between each
# feature and the target variable.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score the encoded features and keep the top 5 of the 8 candidates
selector = SelectKBest(score_func=chi2, k=5)
fit = selector.fit(X_encoded, y)

# Pair every feature name with its chi-square score in one table
scores = pd.concat(
    [
        pd.DataFrame(X.columns, columns=['Feature']),
        pd.DataFrame(fit.scores_, columns=['Score']),
    ],
    axis=1,
)
print(scores.nlargest(5, 'Score'))
Feature Score 6 total_people_confirmed_dead 304.464495 1 area 37.269949 4 brief_accident_details/cause 20.062743 3 road/highway 11.870813 7 time_of_the_accidents 2.780837
Setting up validation framework¶
# Define X and Y: the five top-scoring features and the binary risk target
x = X_encoded[['area', 'road/highway', 'brief_accident_details/cause',
               'total_people_confirmed_dead', 'time_of_the_accidents']]
y = df['risk']

# Hold out 40% of the rows as a test set (fixed seed for reproducibility)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=44
)
Training multiple models¶
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
}

# Fit every model on the training split and keep its test-set predictions
predictions = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions[model_name] = model.predict(X_test)
Model Evaluation¶
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Collect accuracy plus per-class precision / recall / F1 for every model
model_metrics = []
for model_name, y_pred in predictions.items():
    # Per-class report as a dict; zero_division=1 avoids warnings when a
    # class receives no predictions
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=1)

    model_metrics.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision (0)': report['0']['precision'],
        'Recall (0)': report['0']['recall'],
        'F1-Score (0)': report['0']['f1-score'],
        'Precision (1)': report['1']['precision'],
        'Recall (1)': report['1']['recall'],
        'F1-Score (1)': report['1']['f1-score'],
    })

# Tabulate the metrics for side-by-side comparison
metrics_df = pd.DataFrame(model_metrics)
metrics_df
| Model | Accuracy | Precision (0) | Recall (0) | F1-Score (0) | Precision (1) | Recall (1) | F1-Score (1) | |
|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.966667 | 1.000000 | 0.956522 | 0.977778 | 0.875 | 1.000000 | 0.933333 |
| 1 | K-Nearest Neighbors | 0.833333 | 0.846154 | 0.956522 | 0.897959 | 0.750 | 0.428571 | 0.545455 |
| 2 | SVM | 0.800000 | 0.793103 | 1.000000 | 0.884615 | 1.000 | 0.142857 | 0.250000 |
| 3 | Random Forest | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000 | 1.000000 | 1.000000 |
Logistic Regression and Random Forest perform very well, especially Random Forest, which achieves perfect results across all metrics.
K-Nearest Neighbors (KNN) and SVM show varying performance. KNN performs well for class 0 but struggles with class 1 (low recall), while SVM has moderate performance but doesn't do as well with class 1.
Random Forest is the best-performing model with 100% accuracy and perfect results for both classes. (Note: these near-perfect scores are largely explained by target leakage — `risk` was derived directly from `total_people_confirmed_dead`, which is itself included as a feature.)
Confusion matrix¶
len(df) * 0.4  # expected size of the 40% test split
30.0
Our test data has 30 values
# Create labels for the axes
labels = ['Predicted Low Risk(0)', 'Predicted High risk(1)']
ticks = ['Actual Low Risk(0)', 'Actual High risk(1)']

# Plot a confusion-matrix heatmap for each model.
# Reuse the predictions stored during training: refitting the (unseeded)
# models here could yield matrices inconsistent with the metrics table.
for model_name, y_pred in predictions.items():
    # Compute confusion matrix from the stored test-set predictions
    cm = confusion_matrix(y_test, y_pred)

    # Plot confusion matrix heatmap on its own figure
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', square=True,
                xticklabels=labels, yticklabels=ticks, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix Heatmap - {model_name}')
    plt.show()
ROC Curves & AUC¶
from sklearn.metrics import roc_curve, auc

# Rebuild the models with probability=True on the SVC so every model
# exposes predict_proba (required for the ROC curves below)
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'SVM': SVC(probability=True),  # Enable predict_proba by setting probability=True
    'Random Forest': RandomForestClassifier(),
}

# One ROC curve per model, all drawn on the same axes
plt.figure(figsize=(10, 10))
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Probability of the positive class (high risk) drives the curve
    y_prob = model.predict_proba(X_test)[:, 1]

    # ROC curve and its area under the curve
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    auc_score = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_score:.2f})')

# Diagonal reference line for a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curves for All Models')
plt.legend(loc='lower right')

plt.show()
Using the model¶
# Extract a single accident record (row 68) to sanity-check the model on
row = df.iloc[[68]]
row
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | risk | |
|---|---|---|---|---|---|---|---|---|---|---|
| 68 | 2023-06-08 | kanyonyoo_market | kanyonyoo | kitui | thika_kitui_highway | bus_rammed_into_a_saloon_car | passengers | 7 | afternoon | 1 |
# Define the exact features used during training
features = ['area', 'road/highway', 'brief_accident_details/cause', 'total_people_confirmed_dead', 'time_of_the_accidents']

# Reuse the encodings already computed for the full feature matrix instead
# of refitting a LabelEncoder on the single row: fit_transform on one row
# maps every category to 0, which does NOT match the codes seen in training.
X_test_row = X_encoded.loc[row.index, features]

# Make predictions using the trained Random Forest model
random_forest_model = models['Random Forest']  # Using the Random Forest model from the models dictionary
prediction = random_forest_model.predict(X_test_row)[0]

# Compare the predicted value with the actual value
actual_value = row['risk'].iloc[0]  # Extracts the only value from the series(w/o index)
print(f"Prediction: {prediction}")
print(f"Actual: {actual_value}")
Prediction: 1 Actual: 1
Model Performance¶
# Ground-truth labels for the whole dataset, as a NumPy array
actual_full_df = df['risk'].to_numpy()
actual_full_df
array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
0, 1, 1, 1, 0, 0, 0, 0, 0])
# Restrict the encoded feature matrix to the columns used in training
X_encoded = X_encoded.loc[:, features]
X_encoded
| area | road/highway | brief_accident_details/cause | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|
| 0 | 53 | 25 | 24 | 4 | 0 |
| 1 | 36 | 14 | 56 | 1 | 1 |
| 2 | 43 | 30 | 24 | 4 | 1 |
| 3 | 55 | 30 | 24 | 3 | 1 |
| 4 | 34 | 7 | 48 | 1 | 3 |
| ... | ... | ... | ... | ... | ... |
| 70 | 50 | 23 | 11 | 1 | 0 |
| 71 | 8 | 29 | 17 | 1 | 3 |
| 72 | 16 | 24 | 53 | 1 | 0 |
| 73 | 16 | 26 | 16 | 4 | 0 |
| 74 | 53 | 27 | 37 | 2 | 3 |
75 rows × 5 columns
# Predict over the full dataset with the trained Random Forest.
# (Previously this reused the leaked loop variable `model`, which only
# happened to point at the last model fitted in the ROC loop.)
pred_full_df = models['Random Forest'].predict(X_encoded)
pred_full_df
array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
0, 1, 1, 1, 0, 0, 0, 0, 0])
# Side-by-side comparison of ground truth and model output
act_vs_pred = pd.DataFrame({'Actual': actual_full_df,
                            'Predictions': pred_full_df})
act_vs_pred
| Actual | Predictions | |
|---|---|---|
| 0 | 0 | 0 |
| 1 | 0 | 0 |
| 2 | 0 | 0 |
| 3 | 0 | 0 |
| 4 | 0 | 0 |
| ... | ... | ... |
| 70 | 0 | 0 |
| 71 | 0 | 0 |
| 72 | 0 | 0 |
| 73 | 0 | 0 |
| 74 | 0 | 0 |
75 rows × 2 columns